Importing¶

In [ ]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor

from sklearn.metrics import root_mean_squared_error as rmse

from tqdm.auto import tqdm

import random

import salishsea_tools.viz_tools as sa_vi

Datasets Preparation¶

In [ ]:
def datasets_preparation(dataset, dataset2):
    
    drivers = np.stack([np.ravel(dataset['Temperature_(0m-15m)']),
        np.ravel(dataset['Temperature_(15m-100m)']), np.ravel(dataset['Salinity_(0m-15m)']),
        np.ravel(dataset['Salinity_(15m-100m)']), np.ravel(dataset['Nitrate_(0m-15m)']), 
        np.ravel(dataset['Nitrate_(15m-100m)']), np.ravel(dataset2.Clusters_Drivers)])
    
    indx = np.where(~np.isnan(drivers).any(axis=0))
    drivers = drivers[:,indx[0]]

    diat = np.ravel(dataset['Diatom_Production_Rate'])
    diat = diat[indx[0]]

    return(drivers, diat, indx)

Regressor¶

In [ ]:
def regressor (inputs, targets):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs = scale.fit_transform(inputs)
    X_train, _, y_train, _ = train_test_split(inputs, targets, train_size=0.35)

    drivers = None
    diat = None
    
    inputs = None
    targets = None

    model =GradientBoostingRegressor(n_estimators=200)
    regr = BaggingRegressor(model, n_estimators=12, n_jobs=4).fit(X_train, y_train) 

    return (regr)

Regressor 2¶

In [ ]:
def regressor2 (inputs, targets, variable_name):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs2 = scale.fit_transform(inputs)

    outputs_test = regr.predict(inputs2)
   
    m = scatter_plot(targets, outputs_test, variable_name) 
    r = np.round(np.corrcoef(targets, outputs_test)[0][1],3)
    rms = rmse(targets, outputs_test)

    return (r, rms, m)

Regressor 3¶

In [ ]:
def regressor3 (inputs, targets):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs2 = scale.fit_transform(inputs)

    outputs_test = regr.predict(inputs2)
   
    # compute slope m and intercept b
    m, b = np.polyfit(targets, outputs_test, deg=1)
    
    r = np.round(np.corrcoef(targets, outputs_test)[0][1],3)
    rms = rmse(targets, outputs_test)

    return (r, rms, m)

Regressor 4¶

In [ ]:
def regressor4 (inputs, targets, variable_name):
    
    inputs = inputs.transpose()
    
    # Regressor
    scale = preprocessing.StandardScaler()
    inputs2 = scale.fit_transform(inputs)

    outputs = regr.predict(inputs2)

    # Post processing
    indx2 = np.full((len(diat_i.y)*len(diat_i.x)),np.nan)
    indx2[indx[0]] = outputs
    model = np.reshape(indx2,(len(diat_i.y),len(diat_i.x)))

    m = scatter_plot(targets, outputs, variable_name + str(dates[i].date())) 

    # Preparation of the dataarray 
    model = xr.DataArray(model,
        coords = {'y': diat_i.y, 'x': diat_i.x},
        dims = ['y','x'],
        attrs=dict( long_name = variable_name + "Concentration",
        units="mmol m-2"),)
                        
    plotting3(targets, model, diat_i, variable_name)

Printing¶

In [ ]:
def printing (targets, outputs, m):

    print ('The amount of data points is', outputs.size)
    print ('The slope of the best fitting line is ', np.round(m,3))
    print ('The correlation coefficient is:', np.round(np.corrcoef(targets, outputs)[0][1],3))
    print (' The root mean square error is:', rmse(targets,outputs))

Scatter Plot¶

In [ ]:
def scatter_plot(targets, outputs, variable_name):

    # compute slope m and intercept b
    m, b = np.polyfit(targets, outputs, deg=1)

    printing(targets, outputs, m)

    fig, ax = plt.subplots(2, figsize=(5,10), layout='constrained')

    ax[0].scatter(targets,outputs, alpha = 0.2, s = 10)

    lims = [np.min([ax[0].get_xlim(), ax[0].get_ylim()]),
        np.max([ax[0].get_xlim(), ax[0].get_ylim()])]

    # plot fitted y = m*x + b
    ax[0].axline(xy1=(0, b), slope=m, color='r')

    ax[0].set_xlabel('targets')
    ax[0].set_ylabel('outputs')
    ax[0].set_xlim(lims)
    ax[0].set_ylim(lims)
    ax[0].set_aspect('equal')

    ax[0].plot(lims, lims,linestyle = '--',color = 'k')

    h = ax[1].hist2d(targets,outputs, bins=100, cmap='jet', 
        range=[lims,lims], cmin=0.1, norm='log')
    
    ax[1].plot(lims, lims,linestyle = '--',color = 'k')

    # plot fitted y = m*x + b
    ax[1].axline(xy1=(0, b), slope=m, color='r')

    ax[1].set_xlabel('targets')
    ax[1].set_ylabel('outputs')
    ax[1].set_aspect('equal')

    fig.colorbar(h[3],ax=ax[1], location='bottom')

    fig.suptitle(variable_name)

    plt.show()

    return (m)

Plotting¶

In [ ]:
def plotting(variable, name):

    plt.plot(years,variable, marker = '.', linestyle = '')
    plt.xlabel('Years')
    plt.ylabel(name)
    plt.show()

Plotting 2¶

In [ ]:
def plotting2(variable,title):
    
    fig, ax = plt.subplots()

    scatter= ax.scatter(dates,variable, marker='.', c=pd.DatetimeIndex(dates).month)

    ax.legend(handles=scatter.legend_elements()[0], labels=['February','March','April'])
    fig.suptitle('Daily ' + title + ' (15 Feb - 30 Apr)')
    
    fig.show()

Plotting 3¶

In [ ]:
def plotting3(targets, model, variable, variable_name):

    fig, ax = plt.subplots(2,2, figsize = (10,15))

    cmap = plt.get_cmap('cubehelix')
    cmap.set_bad('gray')

    variable.plot(ax=ax[0,0], cmap=cmap, vmin = targets.min(), vmax =targets.max(), cbar_kwargs={'label': variable_name + ' Concentration  [mmol m-2]'})
    model.plot(ax=ax[0,1], cmap=cmap, vmin = targets.min(), vmax = targets.max(), cbar_kwargs={'label': variable_name + ' Concentration  [mmol m-2]'})
    ((variable-model) / variable * 100).plot(ax=ax[1,0], cmap=cmap, cbar_kwargs={'label': variable_name + ' Concentration  [percentage]'})

    plt.subplots_adjust(left=0.1,
        bottom=0.1, 
        right=0.95, 
        top=0.95, 
        wspace=0.35, 
        hspace=0.35)

    sa_vi.set_aspect(ax[0,0])
    sa_vi.set_aspect(ax[0,1])
    sa_vi.set_aspect(ax[1,0])


    ax[0,0].title.set_text(variable_name + ' (targets)')
    ax[0,1].title.set_text(variable_name + ' (outputs)')
    ax[1,0].title.set_text('targets - outputs')
    ax[1,1].axis('off')

    fig.suptitle(str(dates[i].date()))

    plt.show()
    

Training (Random Points)¶

In [ ]:
ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_model_var_old.nc')

ds = ds.isel(time_counter = (np.arange(0, len(ds.Diatom.time_counter),2)), 
    y=(np.arange(ds.y[0], ds.y[-1], 5)), 
    x=(np.arange(ds.x[0], ds.x[-1], 5)))

ds_clusters = xr.open_dataset('/data/ibougoudis/MOAD/files/clustering.nc')

ds_clusters = ds_clusters.isel(time_counter = 
    (np.arange(0, len(ds_clusters.time_counter),2)), 
    y=(np.arange(ds_clusters.y[0], ds_clusters.y[-1], 5)), 
    x=(np.arange(ds_clusters.x[0], ds_clusters.x[-1], 5)))

dates = pd.DatetimeIndex(ds['time_counter'].values)

drivers, diat, _ = datasets_preparation(ds, ds_clusters)

regr = regressor(drivers, diat)

Other Years (Anually)¶

In [ ]:
years = range (2007,2024)

r_all = []
rms_all = []
slope_all = []

for year in tqdm(range (2007,2024)):
    
    dataset = ds.sel(time_counter=str(year))
    dataset2 = ds_clusters.sel(time_counter=str(year))
    
    drivers, diat, _ = datasets_preparation(dataset, dataset2)

    r, rms, m = regressor2(drivers, diat, 'Diatom ' + str(year))
    
    r_all.append(r)
    rms_all.append(rms)
    slope_all.append(m)
    
plotting(np.transpose(r_all), 'Correlation Coefficient')
plotting(np.transpose(rms_all), 'Root Mean Square Error')
plotting (np.transpose(slope_all), 'Slope of the best fitting line')
  0%|          | 0/17 [00:00<?, ?it/s]
The amount of data points is 70794
The slope of the best fitting line is  0.516
The correlation coefficient is: 0.803
 The root mean square error is: 9.874966249035187e-07
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.558
The correlation coefficient is: 0.716
 The root mean square error is: 1.0162487528841137e-06
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.492
The correlation coefficient is: 0.744
 The root mean square error is: 1.0513249770333685e-06
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.513
The correlation coefficient is: 0.719
 The root mean square error is: 1.0609670987393483e-06
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.494
The correlation coefficient is: 0.742
 The root mean square error is: 1.107993553677641e-06
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.493
The correlation coefficient is: 0.744
 The root mean square error is: 1.0647118835704152e-06
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.535
The correlation coefficient is: 0.726
 The root mean square error is: 1.1125976591074695e-06
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.563
The correlation coefficient is: 0.727
 The root mean square error is: 1.1052115575380266e-06
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.487
The correlation coefficient is: 0.577
 The root mean square error is: 1.3307105349729625e-06
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.583
The correlation coefficient is: 0.752
 The root mean square error is: 1.0907458094796693e-06
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.573
The correlation coefficient is: 0.724
 The root mean square error is: 9.182299652729305e-07
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.656
The correlation coefficient is: 0.718
 The root mean square error is: 9.71333660638709e-07
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.647
The correlation coefficient is: 0.745
 The root mean square error is: 9.961492563649416e-07
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.473
The correlation coefficient is: 0.672
 The root mean square error is: 1.1043218822600077e-06
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.586
The correlation coefficient is: 0.808
 The root mean square error is: 9.019614607185168e-07
No description has been provided for this image
The amount of data points is 68931
The slope of the best fitting line is  0.55
The correlation coefficient is: 0.719
 The root mean square error is: 9.821233779038708e-07
No description has been provided for this image
The amount of data points is 70794
The slope of the best fitting line is  0.522
The correlation coefficient is: 0.715
 The root mean square error is: 1.0580659296622051e-06
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Other Years (Daily)¶

In [ ]:
r_all2 = np.array([])
rms_all2 = np.array([])
slope_all2 = np.array([])

for i in tqdm(range (0, len(ds.time_counter))):
    
    dataset = ds.isel(time_counter=i)
    dataset2 = ds_clusters.isel(time_counter=i)

    drivers, diat, _ = datasets_preparation(dataset, dataset2)

    r, rms, m = regressor3(drivers, diat)

    r_all2 = np.append(r_all2,r)
    rms_all2 = np.append(rms_all2,rms)
    slope_all2 = np.append(slope_all2,m)

plotting2(r_all2, 'Correlation Coefficients')
plotting2(rms_all2, 'Root Mean Square Errors')
plotting2(slope_all2, 'Slope of the best fitting line')
  0%|          | 0/640 [00:00<?, ?it/s]
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Daily Maps¶

In [ ]:
maps = random.sample(range(0,len(ds.time_counter)),10)

for i in tqdm(maps):

    dataset = ds.isel(time_counter=i)
    dataset2 = ds_clusters.isel(time_counter=i)
    drivers, diat, indx = datasets_preparation(dataset, dataset2)

    diat_i = dataset['Diatom_Production_Rate']

    regressor4(drivers, diat, 'Diatom Production Rate ')
  0%|          | 0/10 [00:00<?, ?it/s]
The amount of data points is 1863
The slope of the best fitting line is  0.791
The correlation coefficient is: 0.476
 The root mean square error is: 2.280910790166876e-06
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.496
The correlation coefficient is: 0.301
 The root mean square error is: 1.5001502360800124e-06
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.671
The correlation coefficient is: 0.552
 The root mean square error is: 1.8583424030765324e-06
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  1.136
The correlation coefficient is: 0.596
 The root mean square error is: 1.4504210739102311e-06
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.854
The correlation coefficient is: 0.79
 The root mean square error is: 7.621079363244066e-07
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.761
The correlation coefficient is: 0.775
 The root mean square error is: 9.39348892303023e-07
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.49
The correlation coefficient is: 0.386
 The root mean square error is: 8.90808284924846e-07
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.805
The correlation coefficient is: 0.764
 The root mean square error is: 1.67211167236913e-06
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.342
The correlation coefficient is: 0.375
 The root mean square error is: 1.0435722370772418e-06
No description has been provided for this image
No description has been provided for this image
The amount of data points is 1863
The slope of the best fitting line is  0.791
The correlation coefficient is: 0.66
 The root mean square error is: 1.019277914859933e-06
No description has been provided for this image
No description has been provided for this image
In [ ]: